Lab #2 - Perceptron Classification [AMEO]


Author - Aman Hussain

Reg. - 15BCE1077

Email - aman.hussain2015@vit.ac.in


Import Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
from sklearn.preprocessing import normalize

Load the data

In [3]:
graduate_data = pd.read_csv('../Data/processed/X.csv')
graduate_data.tail()
Out[3]:
10percentage 12percentage collegeGPA English Logical Quant Domain ComputerProgramming ElectronicsAndSemicon ComputerScience ... Specialization_EC Specialization_EL Specialization_ME Specialization_other CollegeTier_1 CollegeTier_2 Degree_B.Tech/B.E. Degree_M.Sc. (Tech.) Degree_M.Tech./M.E. Degree_MCA
3993 52.09 55.50 61.50 365 334 475 0.276047 345 0 0 ... 0 0 0 0 0 1 1 0 0 0
3994 90.00 93.00 77.30 415 410 535 0.881620 325 420 0 ... 1 0 0 0 0 1 1 0 0 0
3995 81.86 65.50 70.00 475 475 465 0.488348 405 0 0 ... 0 0 0 0 0 1 1 0 0 0
3996 78.72 69.88 70.42 450 410 320 0.744758 445 0 438 ... 0 0 0 0 0 1 1 0 0 0
3997 70.60 68.00 68.00 565 515 464 0.600057 435 0 0 ... 0 0 0 0 0 1 1 0 0 0

5 rows × 70 columns

In [4]:
salary_category = pd.read_csv('../Data/processed/y_cat.csv')
salary_category.head()
Out[4]:
Salary
0 (370000, 4000000]
1 (370000, 4000000]
2 (300000, 370000]
3 (370000, 4000000]
4 (180000, 300000]

Data Analysis

In [5]:
data = graduate_data
data['Salary'] = salary_category['Salary']
In [6]:
graduate_data.columns
Out[6]:
Index(['10percentage', '12percentage', 'collegeGPA', 'English', 'Logical',
       'Quant', 'Domain', 'ComputerProgramming', 'ElectronicsAndSemicon',
       'ComputerScience', 'MechanicalEngg', 'ElectricalEngg', 'TelecomEngg',
       'CivilEngg', 'conscientiousness', 'agreeableness', 'extraversion',
       'nueroticism', 'openess_to_experience', 'CollegeState_Andhra Pradesh',
       'CollegeState_Assam', 'CollegeState_Bihar', 'CollegeState_Chhattisgarh',
       'CollegeState_Delhi', 'CollegeState_Goa', 'CollegeState_Gujarat',
       'CollegeState_Haryana', 'CollegeState_Himachal Pradesh',
       'CollegeState_Jammu and Kashmir', 'CollegeState_Jharkhand',
       'CollegeState_Karnataka', 'CollegeState_Kerala',
       'CollegeState_Madhya Pradesh', 'CollegeState_Maharashtra',
       'CollegeState_Meghalaya', 'CollegeState_Orissa', 'CollegeState_Punjab',
       'CollegeState_Rajasthan', 'CollegeState_Sikkim',
       'CollegeState_Tamil Nadu', 'CollegeState_Telangana',
       'CollegeState_Union Territory', 'CollegeState_Uttar Pradesh',
       'CollegeState_Uttarakhand', 'CollegeState_West Bengal', 'Gender_f',
       'Gender_m', 'GraduationYear_0', 'GraduationYear_2007',
       'GraduationYear_2009', 'GraduationYear_2010', 'GraduationYear_2011',
       'GraduationYear_2012', 'GraduationYear_2013', 'GraduationYear_2014',
       'GraduationYear_2015', 'GraduationYear_2016', 'GraduationYear_2017',
       'Specialization_CE', 'Specialization_CS', 'Specialization_EC',
       'Specialization_EL', 'Specialization_ME', 'Specialization_other',
       'CollegeTier_1', 'CollegeTier_2', 'Degree_B.Tech/B.E.',
       'Degree_M.Sc. (Tech.)', 'Degree_M.Tech./M.E.', 'Degree_MCA', 'Salary'],
      dtype='object')
In [7]:
features = ['10percentage', '12percentage', 'collegeGPA', 'English',
            'Logical','Quant', 'Domain', 'conscientiousness', 'agreeableness',
            'extraversion', 'nueroticism', 'openess_to_experience','Specialization_CE',
            'Specialization_CS', 'Specialization_EC', 'Specialization_EL', 
            'Specialization_ME', 'Specialization_other', 'CollegeTier_1', 
            'CollegeTier_2', 'Degree_B.Tech/B.E.','Degree_M.Sc. (Tech.)', 
            'Degree_M.Tech./M.E.', 'Degree_MCA']
In [8]:
sns.pairplot(data,
             vars=['10percentage', '12percentage', 'collegeGPA', 'English','Logical','Quant', 'Domain'],
             hue='Salary')
Out[8]:
<seaborn.axisgrid.PairGrid at 0x7fc736ab38d0>
In [9]:
sns.pairplot(data,
             vars=['conscientiousness', 'agreeableness', 'extraversion', 'nueroticism', 'openess_to_experience'],
             hue='Salary')
Out[9]:
<seaborn.axisgrid.PairGrid at 0x7fc7309ffd30>
In [10]:
sns.pairplot(data,
             vars=['Specialization_CE', 'Specialization_CS', 'Specialization_EC',
                   'Specialization_EL', 'Specialization_ME', 'Specialization_other'],
             hue='Salary')
Out[10]:
<seaborn.axisgrid.PairGrid at 0x7fc731a9be10>
In [11]:
sns.pairplot(data,
             vars=['CollegeTier_1', 'CollegeTier_2', 'Degree_B.Tech/B.E.', 
                   'Degree_M.Sc. (Tech.)', 'Degree_M.Tech./M.E.', 'Degree_MCA'],
             hue='Salary')
Out[11]:
<seaborn.axisgrid.PairGrid at 0x7fc727578860>
In [12]:
sns.pairplot(data,
             vars=['conscientiousness', 'agreeableness', 'extraversion', 'nueroticism', 'openess_to_experience',
                   'collegeGPA', 'English','Logical','Quant', 'Domain'],
             hue='Salary')
Out[12]:
<seaborn.axisgrid.PairGrid at 0x7fc725371f28>

The data is not linearly separable, no matter how we look at it. Hence, we can be pretty sure that a single Perceptron classifier will do a bad job. However, for demonstration and visualization purposes we choose the two features: 'domain' & 'agreeableness'.

In [13]:
salary_freq = salary_category['Salary'].value_counts()
salary_1 = salary_freq['(180000, 300000]']
total = salary_freq.sum()
print('Percentage of (180000, 300000]: {:.2f}'.format(salary_1 / total))
Percentage of (180000, 300000]: 0.29

Feature Engineering

In [14]:
del data
In [15]:
X = graduate_data.as_matrix(columns=['Domain', 'agreeableness'])
y = salary_category.as_matrix()
In [16]:
y1 = np.where(y == '(180000, 300000]', -1, 1).ravel()
y2 = np.where(y == '[35000, 180000]', -1, 1).ravel()
y3 = np.where(y == '(370000, 4000000]', -1, 1).ravel()
y4 = np.where(y == '(300000, 370000]', -1, 1).ravel()
y = [y1, y2, y3, y4]

Perceptron

In [17]:
class Perceptron(object):
    """Perceptron classifier.

    Parameters
    ------------
    eta : float
        Learning rate (between 0.0 and 1.0)
    n_iter : int
        Passes over the training dataset.

    Attributes
    -----------
    w_ : 1d-array
        Weights after fitting.
    errors_ : list
        Number of misclassifications in every epoch.

    """
    def __init__(self, eta=0.01, n_iter=10):
        self.eta = eta
        self.n_iter = n_iter

    def fit(self, X, y):
        """Fit training data.

        Parameters
        ----------
        X : {array-like}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        self : object

        """
        self.w_ = np.zeros(1 + X.shape[1])
        self.errors_ = []

        for _ in range(self.n_iter):
            errors = 0
            for xi, target in zip(X, y):
                update = self.eta * (target - self.predict(xi))
                self.w_[1:] += update * xi
                self.w_[0] += update
                errors += int(update != 0.0)
            self.errors_.append(errors)
        return self

    def net_input(self, X):
        """Calculate net input"""
        return np.dot(X, self.w_[1:]) + self.w_[0]

    def activation(self, X):
        """Compute linear activation"""
        return self.net_input(X)

    def predict(self, X):
        """Return class label after unit step"""
        return np.where(self.activation(X) >= 0.0, 1, -1)

Train the perceptrons

In [18]:
clf1 = Perceptron(eta=0.001, n_iter=1000)
clf2 = Perceptron(eta=0.001, n_iter=1000)
clf3 = Perceptron(eta=0.001, n_iter=1000)
clf4 = Perceptron(eta=0.001, n_iter=1000)

clf = [clf1, clf2, clf3, clf4]
In [19]:
for i in range(4):
    clf[i].fit(X, y[i])

Plot the training session

In [20]:
fig = plt.figure(figsize=(10,10))
for i in range(4):
    ax = fig.add_subplot(2,2, i+1)
    ax.plot(range(1, len(clf[i].errors_) + 1), clf[i].errors_, marker='.')
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Misclassifications')

Visualizing the Perceptron

In [21]:
from matplotlib.colors import ListedColormap
In [22]:
def plot_decision_regions(X, y, classifier, resolution=0.02, ax=plt.figure()):
    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                         np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    
    ax.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    ax.set_xlim(xx1.min(), xx1.max())
    ax.set_ylim(xx2.min(), xx2.max())

    # plot class samples
    for idx, cl in enumerate(np.unique(y)):
        ax.scatter(x=X[np.where(y == cl), 0], y=X[np.where(y == cl), 1],
                    alpha=0.8, c=cmap(idx),
                    marker=markers[idx], label=cl)
<matplotlib.figure.Figure at 0x7fc71b67b978>
In [25]:
fig = plt.figure(figsize=(10,10))
for i in range(4):
    ax = fig.add_subplot(2,2, i+1)
    plot_decision_regions(X=X, y=y1, classifier=clf[i],ax=ax)
    ax.set_xlabel('Domain')
    ax.set_ylabel('Agreeableness')
    ax.legend(loc='upper left')

Conclusion

Hence, we see that a Perceptron fails to classify the data as it is not linearly separable.